Diagnosing Model Driven Telemetry timeseries¶

This notebook loads an MDT dataset, visualizes it using t-SNE and uses DBSCAN to detect clusters and associated state transitions ("change-points").

In [1]:
# Reload edited project modules (./modules) automatically before each cell
# execution, so code changes are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

Load dataset information¶

In [2]:
import modules.dataset as ds
# Unpack the bundled demo dataset archive into ./output (project helper;
# presumably idempotent if already extracted — see modules/dataset).
ds.extract_dataset('./datasets/mdt-demo.tgz', './output')
In [3]:
import modules.mdt.datasets as mdt_ds

# Index the extracted datasets and show an interactive dropdown to pick a
# dataset/device; the choice is stored on the `datasets` object and read by
# later cells via get_input_data_file() / get_device().
datasets = mdt_ds.Datasets(datasets_dir='./output')
datasets.jupyter_select_dataset_device(select_file=False)

Available Datasets:


mdt-demo


Box(children=(Dropdown(description='Dataset:', layout=Layout(display='flex', justify_content='flex-start', wid…

Show Dataset Sample¶

In [4]:
import pandas as pd
import modules.utils as utils

# Resolve the preprocessed CSV for the dataset/device selected above.
data_fn, _ = datasets.get_input_data_file("preprocessed_offline.csv")

# Pass the path directly so pandas opens AND closes the file itself;
# the previous `pd.read_csv(open(data_fn, 'rb'))` leaked an open file handle.
df = pd.read_csv(data_fn)

# Preview only the first 19 rows / 9 columns — the full frame is very wide.
utils.displayDataFrame(df.iloc[0:19, 0:9])
ts n0:Cisco-IOS-XR-drivers-media-eth-oper:ethernet-interface_statistics_statistic.csv:HundredGigE0/0/0/0:received-good-bytes n0:Cisco-IOS-XR-drivers-media-eth-oper:ethernet-interface_statistics_statistic.csv:HundredGigE0/0/0/0:received-good-frames n0:Cisco-IOS-XR-drivers-media-eth-oper:ethernet-interface_statistics_statistic.csv:HundredGigE0/0/0/0:received-multicast-frames n0:Cisco-IOS-XR-drivers-media-eth-oper:ethernet-interface_statistics_statistic.csv:HundredGigE0/0/0/0:received-total-bytes n0:Cisco-IOS-XR-drivers-media-eth-oper:ethernet-interface_statistics_statistic.csv:HundredGigE0/0/0/0:received-total-frames n0:Cisco-IOS-XR-drivers-media-eth-oper:ethernet-interface_statistics_statistic.csv:HundredGigE0/0/0/0:received-total-octet-frames-from1024-to1518 n0:Cisco-IOS-XR-drivers-media-eth-oper:ethernet-interface_statistics_statistic.csv:HundredGigE0/0/0/0:received-total-octet-frames-from128-to255 n0:Cisco-IOS-XR-drivers-media-eth-oper:ethernet-interface_statistics_statistic.csv:HundredGigE0/0/0/0:received-total-octet-frames-from1519-to-max
1558249381.658611 0.681327 0.687531 0.504115 0.681327 0.687531 0.451305 0.858389 0.681585
1558249391.658611 0.681327 0.687531 0.504115 0.681327 0.687531 0.451305 0.858389 0.681585
1558249401.658611 0.644663 0.648323 0.258243 0.644663 0.648323 0.517278 0.741312 0.644928
1558249411.658611 0.626289 0.628532 0.129121 0.626289 0.628532 0.558469 0.677508 0.626523
1558249421.658611 0.616965 0.618757 0.294610 0.616965 0.618757 0.586249 0.653254 0.617207
1558249431.658611 0.608525 0.610206 0.169590 0.608525 0.610206 0.606070 0.636611 0.608696
1558249441.658611 0.607697 0.609107 0.314231 0.607697 0.609107 0.637078 0.624820 0.607856
1558249451.658611 0.605199 0.606604 0.409198 0.605199 0.606604 0.633205 0.620602 0.605310
1558249461.658611 0.617882 0.619045 0.227750 0.617882 0.619045 0.648154 0.627273 0.618074
1558249471.658611 0.604494 0.605476 0.113875 0.604494 0.605476 0.626065 0.611651 0.604676
1558249481.658611 0.606327 0.607145 0.285342 0.606327 0.607145 0.636013 0.609389 0.606486
1558249491.658611 0.604294 0.604613 0.166601 0.604294 0.604613 0.650453 0.591848 0.604305
1558249501.658611 0.581812 0.580794 0.311120 0.581812 0.580794 0.604260 0.516707 0.581356
1558249511.658611 0.586338 0.585610 0.407645 0.586338 0.585610 0.604076 0.529965 0.585930
1558249521.658611 0.597648 0.597462 0.228587 0.597648 0.597462 0.608988 0.568426 0.597505
1558249531.658611 0.617345 0.617636 0.114293 0.617345 0.617636 0.640904 0.606600 0.617392
1558249541.658611 0.606407 0.606873 0.282862 0.606407 0.606873 0.631316 0.605719 0.606531
1558249551.658611 0.607281 0.607856 0.168050 0.607281 0.607856 0.649228 0.612613 0.607421
1558249561.658611 0.612705 0.613097 0.309156 0.612705 0.613097 0.654539 0.613460 0.612936

Helper functions (load data)¶

In [5]:
import re
from datetime import datetime, timezone


import numpy as np

# Sentinel time-window bounds (UTC epoch seconds) used as load_data() defaults
# so that, unless overridden, no rows are filtered out. The magnitudes
# correspond roughly to year 1 and year 9999.
MIN_TIMESTAMP = -62135596800
MAX_TIMESTAMP = 253402214400

# Keys of the `data_selection` mapping accepted by load_data(); each selects a
# different representation of the timeseries to include in the returned frame.
ORIGINAL_DATA     = "original data"
REDUCED_DATA      = "reduced data"
FIRST_DERIVATIVE  = "first derivative"
SECOND_DERIVATIVE = "second derivative"

def get_feature_names_bis(path, delimiter=','):
    """Return the column names from the header (first) line of a CSV file.

    A more direct and simpler implementation than get_feature_names().
    """
    with open(path, "r") as csv_file:
        first_line = csv_file.readline()
    return first_line.strip('\n').split(delimiter)

def scale_data(d):
    """Z-score each column of `d`: subtract the per-column mean, then divide
    by the per-column standard deviation.

    Columns with (near-)zero spread keep a divisor of 1 so constant features
    come out as zeros instead of NaN/inf.
    """
    centered = d - np.mean(d, axis=0)
    col_scale = np.std(centered, axis=0)
    col_scale[col_scale < 1e-6] = 1
    return centered / col_scale

def load_data(in_fn, reduced=None, startTime=MIN_TIMESTAMP, endTime=MAX_TIMESTAMP,
              scale=False, data_selection=None, ft_regex=None, remove_nan=False,
              remove_inf=False) -> "tuple[np.ndarray, pd.DataFrame]":
    """Load a preprocessed timeseries CSV and return (timestamps, DataFrame).

    Parameters
    ----------
    in_fn : str
        Path to a CSV whose first column is a unix timestamp ('ts') and whose
        remaining columns are numeric features (header row expected).
    reduced : array-like, optional
        Pre-computed low-dimensional representation, one row per sample; only
        used when REDUCED_DATA is selected.
    startTime, endTime : float or datetime
        Inclusive time window; naive datetimes are interpreted as UTC.
    scale : bool
        If True, z-score each feature column (see scale_data).
    data_selection : str, dict or None
        One of ORIGINAL_DATA / REDUCED_DATA / FIRST_DERIVATIVE /
        SECOND_DERIVATIVE, or a dict mapping those keys to booleans.
        None selects ORIGINAL_DATA.  (The old default of a mutable `{}`
        raised KeyError on any call that relied on it; missing dict keys are
        now treated as False.)
    ft_regex : str, optional
        Case-insensitive regex; only features whose name matches (re.match,
        i.e. anchored at the start) are kept.
    remove_nan, remove_inf : bool
        Drop feature columns containing any NaN / inf values.

    Returns
    -------
    tuple (np.ndarray, pd.DataFrame)
        Timestamps within the window, and a DataFrame whose first column is
        'ts' followed by the selected feature columns.
    """
    data = np.genfromtxt(in_fn, dtype=float, delimiter=',', skip_header=1)

    # Normalize data_selection into a dict of booleans; unknown or missing
    # keys are simply treated as "not selected".
    if data_selection is None:
        data_selection = {ORIGINAL_DATA: True}
    elif isinstance(data_selection, str):
        data_selection = {data_selection: True}

    tstp = data[:, 0]
    data = data[:, 1:]
    ft_names = np.asarray(get_feature_names_bis(in_fn)[1:])
    if ft_regex:
        ft_filter = re.compile(ft_regex, re.IGNORECASE)
        ft_idx = np.array([i for i, v in enumerate(map(ft_filter.match, ft_names)) if v is not None])
        if len(ft_idx) > 0:
            data = data[:, ft_idx]
            ft_names = ft_names[ft_idx]
        else:
            data = np.array([])
            ft_names = np.array([])

    if remove_nan:
        inval_col = np.where(np.any(np.isnan(data), axis=0))
        data = np.delete(data, inval_col, axis=1)
        ft_names = np.delete(ft_names, inval_col)

    if remove_inf:
        inval_col = np.where(np.any(np.isinf(data), axis=0))
        data = np.delete(data, inval_col, axis=1)
        ft_names = np.delete(ft_names, inval_col)

    if scale:
        data = scale_data(data)

    final_names = np.asarray([])
    final_data = np.array([[] for _ in range(len(data))])
    derivative = None
    if data_selection.get(FIRST_DERIVATIVE, False) or data_selection.get(SECOND_DERIVATIVE, False):
        derivative = np.diff(data, axis=0)

    if data_selection.get(ORIGINAL_DATA, False):
        final_data = np.append(final_data, data, axis=1)
        final_names = np.append(final_names, ft_names)

    if data_selection.get(REDUCED_DATA, False):
        final_data = np.append(final_data, reduced, axis=1)
        final_names = np.append(final_names, [f"{x}_bytes-sent_reduced" for x in range(len(reduced[0]))])

    if data_selection.get(FIRST_DERIVATIVE, False):
        # np.diff drops one row; repeat the first derivative row so the
        # derivative columns stay row-aligned with the timestamps.
        final_data = np.append(final_data, np.vstack([derivative[0, :], derivative]), axis=1)
        final_names = np.append(final_names, [f"{x}_bytes-send_deriv" for x in ft_names])

    if data_selection.get(SECOND_DERIVATIVE, False):
        second_derivative = np.diff(derivative, axis=0)
        # Double differentiation loses two rows; pad with the first row twice.
        second_derivative = np.vstack([second_derivative[0, :], second_derivative[0, :], second_derivative])
        final_data = np.append(final_data, second_derivative, axis=1)
        final_names = np.append(final_names, [f"{x}_bytes-sent_deriv2" for x in ft_names])

    # Prepend the timestamp column so the frame is self-describing.
    final_data = np.append(tstp.reshape(-1, 1), final_data, axis=1)
    final_names = np.append(np.asarray('ts'), final_names)

    # Filter by time window; datetime bounds are converted to UTC epoch seconds.
    if isinstance(startTime, datetime):
        startTime = startTime.replace(tzinfo=timezone.utc).timestamp()
    if isinstance(endTime, datetime):
        endTime = endTime.replace(tzinfo=timezone.utc).timestamp()
    final_data = final_data[
        (final_data[:, 0] >= startTime) &
        (final_data[:, 0] <= endTime)
    ]
    final_tstp = final_data[:, 0]

    return final_tstp, pd.DataFrame(final_data, columns=final_names)

Detect Changepoints¶

In [6]:
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import MinMaxScaler
from sklearn.manifold import TSNE

# DBSCAN neighborhood radius, applied in the min-max-scaled 2-D embedding.
max_data_point_distance = 0.05

# Load every feature except time/second counters (they change trivially).
tstp, dataframe = load_data(data_fn, scale=False, data_selection=ORIGINAL_DATA, ft_regex="^(?!.*(time|second)).*")

fulldata = dataframe.to_numpy(dtype=float)
tstp = fulldata[:, 0]
data = fulldata[:, 1:]

# Embed the features in 2-D (PCA init + fixed seed for reproducibility).
tsne = TSNE(n_components=2, init='pca', random_state=0)
reduced = tsne.fit_transform(data)

# Cluster the scaled embedding; a change of cluster label between two
# consecutive samples marks a change-point.
dbscan = DBSCAN(eps=max_data_point_distance)
clusters = dbscan.fit(MinMaxScaler().fit_transform(reduced)).labels_

changes = np.where(clusters[:-1] != clusters[1:])[0]
changepoints = [tstp[t] for t in changes]

print(changepoints)
[1558250581.658611, 1558251821.658611, 1558253001.658611, 1558254201.658611, 1558255381.658611, 1558256611.658611, 1558257801.658611, 1558258991.658611]

Show Changepoints¶

In [7]:
from modules.mdt.data_utils import plot_data_anime
import plotly.graph_objects as go

# One event marker per change-point, timestamped midway between the two
# samples whose cluster labels differ.
events = []
for i, t in enumerate(changes):
    events.append({
        "timestamp": (tstp[t+1] + tstp[t]) / 2.0,
        "event": str(i+1),
        "device": datasets.get_device(),
        "interface": None
    })

plot_data, frames = plot_data_anime(reduced, tstp, events, color='rgb(128,177,211)')

# Animation controls: 'Go' plays the frames, 'Pause' halts immediately.
go_button = {
    'args': [None, {
        'frame': {'duration': 100, 'redraw': False},
        'fromcurrent': True,
        'transition': {'duration': 50, 'easing': 'quadratic-in-out'}
    }],
    'label': 'Go',
    'method': 'animate'
}
pause_button = {
    'args': [[None], {
        'frame': {'duration': 0, 'redraw': False},
        'mode': 'immediate',
        'transition': {'duration': 0}
    }],
    'label': 'Pause',
    'method': 'animate'
}

layout = {
    'title': "tSNE 2-D Visualization",
    'autosize': False,
    'width': 1000,
    'height': 1000,
    'updatemenus': [{
        'buttons': [go_button, pause_button],
        'direction': 'left',
        'pad': {'r': 10, 't': 10},
        'showactive': False,
        'type': 'buttons',
        'x': 0.1,
        'xanchor': 'right',
        'y': 1,
        'yanchor': 'bottom'
    }]
}

fig = go.Figure(data=plot_data, layout=layout, frames=frames)
fig.show()
In [ ]: